Dataset Preprocessing¶

Library loading¶

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
import warnings
import matplotlib as plt


warnings.filterwarnings('ignore')
In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3
figure.dpi 50
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor white
dotSize 20
In [3]:
DSname="UpD50"
DSDirName="Sample_S20272_157"

Configure paths¶

In [4]:
outdir = "../data/output"
if not os.path.exists(outdir):
   # Create a new directory because it does not exist
   os.makedirs(outdir)
    
if not os.path.exists("../data/output/adatas"):
   # Create a new directory because it does not exist
   os.makedirs("../data/output/adatas")

with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
results_file = outdir+'/'+DSname+'.h5ad'

Dataset loading Downstream D50¶

In [5]:
scanpyObj = sc.read_10x_mtx('../data/'+DSDirName+'/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

scanpyObj.obs['dataset'] = DSname
scanpyObj.obs_names = [i + "_" + j for i, j in zip(scanpyObj.obs_names.tolist(), scanpyObj.obs["dataset"].tolist())]
... reading from cache file cache/..-data-Sample_S20272_157-filtered_feature_bc_matrix-matrix.h5ad
In [6]:
scanpyObj.var_names_make_unique()
scanpyObj.obs_names_make_unique()

Importing cellIDs¶

In [7]:
cellID = pd.read_csv('../data/'+DSDirName+'/aggregatedCall/aggregatedCall.tsv', sep ="\t",index_col = 0)
cellID
cellID.index = [i + "_" + DSname for i in cellID.index.tolist()]
In [8]:
cellID.Consensus.unique()
Out[8]:
array(['809', '3391B', 'KOLF', 'MIFF1', 'doublet', 'LowQuality'],
      dtype=object)
In [9]:
scanpyObj.obs['cellID'] = cellID.loc[scanpyObj.obs_names, 'Consensus']
In [10]:
scanpyObj.obs
Out[10]:
dataset cellID
AAACCTGAGATGTGGC-1_UpD50 UpD50 809
AAACCTGAGCAACGGT-1_UpD50 UpD50 doublet
AAACCTGAGCGGATCA-1_UpD50 UpD50 3391B
AAACCTGAGCGTAATA-1_UpD50 UpD50 3391B
AAACCTGAGTCGCCGT-1_UpD50 UpD50 doublet
... ... ...
TTTGTCATCGCCTGTT-1_UpD50 UpD50 3391B
TTTGTCATCGCGTAGC-1_UpD50 UpD50 3391B
TTTGTCATCGGTGTTA-1_UpD50 UpD50 809
TTTGTCATCGTTTATC-1_UpD50 UpD50 809
TTTGTCATCTCTTGAT-1_UpD50 UpD50 809

17100 rows × 2 columns

Configure colors¶

In [11]:
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}


for line in iPSC_lines_map.keys():
    cellID_colors[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["color"]
    cellID_newName_colors[iPSC_lines_map[line]["newName"]] = iPSC_lines_map[line]["color"]
    cellID_newNames[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["newName"]

scanpyObj.obs["cellID"] = scanpyObj.obs["cellID"].astype("category")
scanpyObj.obs["cellID_newName"] = scanpyObj.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")
scanpyObj.uns["cellID_colors"] = [cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
scanpyObj.uns["cellID_newName_colors"] = [cellID_newName_colors[line] for line in scanpyObj.obs["cellID_newName"].cat.categories]
In [12]:
[cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
Out[12]:
['#DBB807', '#FF0054', '#0FB248', '#a8a5a5', '#7B00FF', '#403b3b']

Preprocessing¶

Show those genes that yield the highest fraction of counts in each single cells, across all cells.

In [13]:
sc.pl.highest_expr_genes(scanpyObj, n_top=20 )
normalizing counts per cell
    finished (0:00:00)

Subsetting according to good barcodes¶

In [14]:
goodBarcodes=pd.read_csv(outdir+'/'+DSname+'_filteredCells.tsv', sep="\t")["BCs"]+"_"+DSname
scanpyObj = scanpyObj[goodBarcodes,:]

QC metrices¶

In [15]:
scanpyObj.var['mt'] = scanpyObj.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

scanpyObj.var['ribo'] = scanpyObj.var_names.str.startswith('RP')  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['ribo'], percent_top=None, log1p=True, inplace=True)
In [16]:
scanpyObj
Out[16]:
AnnData object with n_obs × n_vars = 10586 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo'
    uns: 'cellID_colors', 'cellID_newName_colors'
In [17]:
scanpyObj.var
Out[17]:
gene_ids feature_types mt n_cells_by_counts mean_counts log1p_mean_counts pct_dropout_by_counts total_counts log1p_total_counts ribo
MIR1302-2HG ENSG00000243485 Gene Expression False 2 0.000189 0.000189 99.981107 2.0 1.098612 False
FAM138A ENSG00000237613 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
OR4F5 ENSG00000186092 Gene Expression False 1 0.000094 0.000094 99.990554 1.0 0.693147 False
AL627309.1 ENSG00000238009 Gene Expression False 14 0.001323 0.001322 99.867750 14.0 2.708050 False
AL627309.3 ENSG00000239945 Gene Expression False 3 0.000283 0.000283 99.971661 3.0 1.386294 False
... ... ... ... ... ... ... ... ... ... ...
AC233755.2 ENSG00000277856 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC233755.1 ENSG00000275063 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC240274.1 ENSG00000271254 Gene Expression False 419 0.041375 0.040542 96.041942 438.0 6.084499 False
AC213203.1 ENSG00000277475 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM231C ENSG00000268674 Gene Expression False 1 0.000094 0.000094 99.990554 1.0 0.693147 False

33538 rows × 10 columns

In [18]:
scanpyObj.obs
Out[18]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGAGATGTGGC-1_UpD50 UpD50 809 CTL04E 1802 7.497207 4326.0 8.372630 122.0 4.812184 2.820157 1123.0 7.024649 25.959316
AAACCTGAGCGGATCA-1_UpD50 UpD50 3391B CTL01 2139 7.668561 7216.0 8.884194 68.0 4.234107 0.942350 2753.0 7.920810 38.151329
AAACCTGAGTGCCAGA-1_UpD50 UpD50 809 CTL04E 1222 7.109062 2554.0 7.845808 17.0 2.890372 0.665623 566.0 6.340359 22.161316
AAACCTGCAAGCGCTC-1_UpD50 UpD50 809 CTL04E 1568 7.358194 4419.0 8.393895 45.0 3.828641 1.018330 997.0 6.905753 22.561665
AAACCTGCACACATGT-1_UpD50 UpD50 3391B CTL01 3299 8.101678 8716.0 9.073030 35.0 3.583519 0.401560 1379.0 7.229839 15.821478
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCATCGCCTGTT-1_UpD50 UpD50 3391B CTL01 1361 7.216709 2898.0 7.972121 20.0 3.044523 0.690131 628.0 6.444131 21.670118
TTTGTCATCGCGTAGC-1_UpD50 UpD50 3391B CTL01 1203 7.093405 2296.0 7.739359 11.0 2.484907 0.479094 547.0 6.306275 23.824041
TTTGTCATCGGTGTTA-1_UpD50 UpD50 809 CTL04E 849 6.745236 1451.0 7.280697 17.0 2.890372 1.171606 179.0 5.192957 12.336320
TTTGTCATCGTTTATC-1_UpD50 UpD50 809 CTL04E 2815 7.943073 8304.0 9.024613 171.0 5.147494 2.059248 2092.0 7.646354 25.192678
TTTGTCATCTCTTGAT-1_UpD50 UpD50 809 CTL04E 1829 7.512071 3861.0 8.258941 184.0 5.220356 4.765605 473.0 6.161207 12.250712

10586 rows × 13 columns

In [19]:
sc.pl.violin(scanpyObj, ['n_genes_by_counts', 'total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['total_counts_mt','total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [20]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [21]:
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_ribo')
In [22]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [23]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [24]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True)
In [25]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [26]:
scanpyObj.write_h5ad(outdir+'/adatas/'+DSname+'_raw.h5ad')
In [27]:
sc.pp.normalize_total(scanpyObj, exclude_highly_expressed=True, max_fraction=.1)
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['MALAT1']
    finished (0:00:00)
In [28]:
sc.pp.log1p(scanpyObj)
In [29]:
scanpyObj.raw = scanpyObj

Subset according to JointHVGs and Filtered Barcodes from previous step¶

In [30]:
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated.txt", sep = "\t")["HVG"]

scanpyObj = scanpyObj[:,HVGs]
scanpyObj.var["highly_variable"] = True
In [31]:
#sc.pp.highly_variable_genes(scanpyObj, min_mean=0.0125, max_mean=5, min_disp=0.5)
In [32]:
#scanpyObj = scanpyObj[:, HVG.tolist()]

#Multiplexing = Multiplexing[:, Multiplexing.var.highly_variable]
In [33]:
#sc.pl.highly_variable_genes(scanpyObj)

Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.

In [34]:
sc.pp.regress_out(scanpyObj, ['total_counts',"pct_counts_mt"])
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:27)

Scale each gene to unit variance. Clip values exceeding standard deviation 10.

In [35]:
sc.pp.scale(scanpyObj, max_value=20)
In [36]:
scanpyObj.obs
Out[36]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGAGATGTGGC-1_UpD50 UpD50 809 CTL04E 1802 7.497207 4326.0 8.372630 122.0 4.812184 2.820157 1123.0 7.024649 25.959316
AAACCTGAGCGGATCA-1_UpD50 UpD50 3391B CTL01 2139 7.668561 7216.0 8.884194 68.0 4.234107 0.942350 2753.0 7.920810 38.151329
AAACCTGAGTGCCAGA-1_UpD50 UpD50 809 CTL04E 1222 7.109062 2554.0 7.845808 17.0 2.890372 0.665623 566.0 6.340359 22.161316
AAACCTGCAAGCGCTC-1_UpD50 UpD50 809 CTL04E 1568 7.358194 4419.0 8.393895 45.0 3.828641 1.018330 997.0 6.905753 22.561665
AAACCTGCACACATGT-1_UpD50 UpD50 3391B CTL01 3299 8.101678 8716.0 9.073030 35.0 3.583519 0.401560 1379.0 7.229839 15.821478
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCATCGCCTGTT-1_UpD50 UpD50 3391B CTL01 1361 7.216709 2898.0 7.972121 20.0 3.044523 0.690131 628.0 6.444131 21.670118
TTTGTCATCGCGTAGC-1_UpD50 UpD50 3391B CTL01 1203 7.093405 2296.0 7.739359 11.0 2.484907 0.479094 547.0 6.306275 23.824041
TTTGTCATCGGTGTTA-1_UpD50 UpD50 809 CTL04E 849 6.745236 1451.0 7.280697 17.0 2.890372 1.171606 179.0 5.192957 12.336320
TTTGTCATCGTTTATC-1_UpD50 UpD50 809 CTL04E 2815 7.943073 8304.0 9.024613 171.0 5.147494 2.059248 2092.0 7.646354 25.192678
TTTGTCATCTCTTGAT-1_UpD50 UpD50 809 CTL04E 1829 7.512071 3861.0 8.258941 184.0 5.220356 4.765605 473.0 6.161207 12.250712

10586 rows × 13 columns

Principal component analysis¶

In [37]:
sc.tl.pca(scanpyObj, svd_solver='arpack')
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:04)
In [38]:
sc.pl.pca(scanpyObj, color='MKI67')
In [39]:
sc.pl.pca_variance_ratio(scanpyObj, log=True)
In [40]:
scanpyObj
Out[40]:
AnnData object with n_obs × n_vars = 10586 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Computing the neighborhood graph¶

In [41]:
sc.pp.neighbors(scanpyObj, n_neighbors=10, n_pcs=9)
computing neighbors
    using 'X_pca' with n_pcs = 9
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)

Embedding the neighborhood graph¶

In [42]:
sc.tl.umap(scanpyObj)
scanpyObj
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:04)
Out[42]:
AnnData object with n_obs × n_vars = 10586 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [43]:
sc.pl.umap(scanpyObj, color=['MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [44]:
sc.tl.diffmap(scanpyObj)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.996754   0.99504185 0.9920966  0.9898147  0.9879345
     0.98735774 0.98407966 0.9831898  0.97786796 0.97119904 0.96953475
     0.9642359  0.9634919  0.9609128 ]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
In [45]:
sc.tl.dpt(scanpyObj)
WARNING: No root cell found. To compute pseudotime, pass the index or expression vector of a root cell, one of:
    adata.uns['iroot'] = root_cell_index
    adata.var['xroot'] = adata[root_cell_name, :].X
computing Diffusion Pseudotime using n_dcs=10
    finished: added
 (0:00:00)
In [46]:
sc.pl.diffmap(scanpyObj, color=[ 'MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [47]:
scanpyObj.X.max()
Out[47]:
20.0